import re

import requests
# day1
from fake_useragent import UserAgent

# url = 'https://www.8gdy8.com/index.html'
# response = requests.get(url)
# if response.status_code == 200:
#     info = f"""<p style="overflow: hidden;text-overflow: ellipsis;white-space: nowrap;">
#                     <span class="badge">
#                         *.*?
#                     </span>
#                 <span>(.*?)</span>
#             </p>"""
#     datas = re.findall(info, response.text)
#     print(datas)


# url = 'https://spiderbuf.cn/web-scraping-practice/requests-lxml-for-scraping-beginner'
# response = requests.get(url)
# if response.status_code == 200:
#     html = response.text
#     r = """<tr>
#                     <td>(.*?)</td>
#                     <td>(.*?)</td>
#                     <td>(.*?)</td>
#                     <td>(.*?)</td>
#                     <td>(.*?)</td>
#                     <td>(.*?)</td>
#                     <td>(.*?)</td>
#                     <td>(.*?)</td>
#                 </tr>"""
#     items = re.findall(r, html, re.S)
#     print(items)


# url = 'https://spiderbuf.cn/web-scraping-practice/scraper-http-header'
# ua = UserAgent()
# headers = {
#     'User-Agent': ua.random
# }
# response = requests.get(url, headers=headers)
#
# if response.status_code == 200:
#     html = response.text
#     r = """<tr>
#                     <td>(.*?)</td>
#                     <td>(.*?)</td>
#                     <td>(.*?)</td>
#                     <td>(.*?)</td>
#                     <td>(.*?)</td>
#                     <td>(.*?)</td>
#                     <td>(.*?)</td>
#                     <td>(.*?)</td>
#                 </tr>"""
#     items = re.findall(r, html, re.S)
#     print(items)



# Images
# url = 'https://spiderbuf.cn/web-scraping-practice/scraping-images-from-web'
# response = requests.get(url)
# if response.status_code == 200:
#     html = response.text
#     r = """<img src="(.*?)" class="img-responsive img-thumbnail"
#                     alt="python爬取图片">"""
#
#     imgs = re.findall(r, html)
#     for img in imgs:
#         img_url = f'https://spiderbuf.cn{img}'
#         response = requests.get(img_url)
#         with open(f'static/test/{imgs.index(img)}.png', 'wb') as f:
#             f.write(response.content)

# Pagination / page turning
# Pagination exercise: fetch the listing page, collect the links to the other
# pages from the pager, then pull the 8-column table rows off each page.
base_url = 'https://spiderbuf.cn/web-scraping-practice/web-pagination-scraper'

# One UserAgent instance is enough; `.random` still yields a fresh UA string
# per request, so there is no need to rebuild it inside the loop.
ua = UserAgent()

# Row pattern is loop-invariant — define it once instead of rebuilding the
# literal on every page. The embedded whitespace must match the site's HTML
# exactly, so it is kept byte-for-byte; re.S lets (.*?) span newlines.
ROW_PATTERN = """<tr>
                        <td>(.*?)</td>
                        <td>(.*?)</td>
                        <td>(.*?)</td>
                        <td>(.*?)</td>
                        <td>(.*?)</td>
                        <td>(.*?)</td>
                        <td>(.*?)</td>
                        <td>(.*?)</td>
                    </tr>"""

# timeout= prevents a stalled server from hanging the script forever.
response = requests.get(base_url, headers={'User-Agent': ua.random}, timeout=10)
if response.status_code == 200:
    # Pager links are relative (e.g. "?pageno=2"); the first entry is skipped,
    # matching the original behavior — presumably it points at the current
    # page (TODO confirm against the live pager markup).
    links = re.findall(r'<li><a href="(.*?)">.*?</a></li>', response.text)
    for link in links[1:]:
        # Distinct name (`page_url`) so the base URL is not clobbered,
        # unlike the original's reuse of `url` as the loop variable.
        page_url = f'{base_url}{link}'
        page = requests.get(page_url, headers={'User-Agent': ua.random}, timeout=10)
        if page.status_code == 200:
            items = re.findall(ROW_PATTERN, page.text, re.S)
            print(items)